# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Jupyter/IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline
# setting max number of columns to be displayed
pd.set_option('display.max_columns',250)
pd.set_option('display.max_rows',250)
# Suppress warnings globally (NOTE(review): this also hides pandas deprecation
# warnings that may matter on newer versions — consider narrowing the filter)
warnings.filterwarnings('ignore')
# Load the raw churn dataset; the CSV is expected in the working directory
telecom = pd.read_csv("telecom_churn_data.csv")
telecom.head()
# Let's check the dimensions of the dataframe
telecom.shape
telecom.describe(include='all')
telecom.info()
There are 99999 rows and 226 columns in the data. Most of the columns are numeric, but we need to inspect which of them are actually categorical.
# create column name list by types of columns
# identifier columns — carry no predictive signal
id_cols = ['mobile_number', 'circle_id']
# date columns for months June(_6) through September(_9)
date_cols = ['last_date_of_month_6',
'last_date_of_month_7',
'last_date_of_month_8',
'last_date_of_month_9',
'date_of_last_rech_6',
'date_of_last_rech_7',
'date_of_last_rech_8',
'date_of_last_rech_9',
'date_of_last_rech_data_6',
'date_of_last_rech_data_7',
'date_of_last_rech_data_8',
'date_of_last_rech_data_9'
]
# binary/flag columns treated as categorical
cat_cols = ['night_pck_user_6',
'night_pck_user_7',
'night_pck_user_8',
'night_pck_user_9',
'fb_user_6',
'fb_user_7',
'fb_user_8',
'fb_user_9'
]
# everything that is not an id, date or categorical column is numeric
num_cols = [column for column in telecom.columns if column not in id_cols + date_cols + cat_cols]
# print the number of columns in each list
print("#ID cols: %d\n#Date cols:%d\n#Numeric cols:%d\n#Category cols:%d" % (len(id_cols), len(date_cols), len(num_cols), len(cat_cols)))
# sanity check: the four lists together must cover every column exactly once
print(len(id_cols) + len(date_cols) + len(num_cols) + len(cat_cols) == telecom.shape[1])
# look at missing value ratio (percentage of rows) in each column
telecom.isnull().sum()*100/telecom.shape[0]  # next: impute missing values, starting with zero-imputation
# some recharge columns have minimum value of 1 while some don't
recharge_cols = ['total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8', 'total_rech_data_9',
'count_rech_2g_6', 'count_rech_2g_7', 'count_rech_2g_8', 'count_rech_2g_9',
'count_rech_3g_6', 'count_rech_3g_7', 'count_rech_3g_8', 'count_rech_3g_9',
'max_rech_data_6', 'max_rech_data_7', 'max_rech_data_8', 'max_rech_data_9',
'av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8', 'av_rech_amt_data_9',
]
telecom[recharge_cols].describe(include='all')
# The recharge amount and the recharge date are missing together, which suggests the customer didn't recharge
telecom.loc[telecom.total_rech_data_6.isnull() & telecom.date_of_last_rech_data_6.isnull(), ["total_rech_data_6", "date_of_last_rech_data_6"]].head(20)
In the recharge variables where the minimum value is 1, we can impute missing values with zeroes, since a missing value means the customer didn't recharge their number that month.
# Recharge columns whose observed minimum is 1: a missing value means the
# customer did not recharge that month, so 0 is a safe imputation value.
zero_impute = ['total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8', 'total_rech_data_9',
'av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8', 'av_rech_amt_data_9',
'max_rech_data_6', 'max_rech_data_7', 'max_rech_data_8', 'max_rech_data_9'
]
# Impute missing values with 0 (DataFrame.fillna handles all columns at once;
# the per-column lambda was unnecessary)
telecom[zero_impute] = telecom[zero_impute].fillna(0)
# verify the imputation: every ratio below must now be 0
print("Missing value ratio:\n")
# BUG FIX: the percentage must be relative to the row count (shape[0]);
# the original divided by the column count (shape[1])
print(telecom[zero_impute].isnull().sum()*100/telecom.shape[0])
# summary statistics after imputation
print("\n\nSummary statistics\n")
print(telecom[zero_impute].describe(include='all'))
# Drop id and date columns: ids carry no signal and the raw dates are not used further
print("Shape before dropping: ", telecom.shape)
telecom = telecom.drop(id_cols + date_cols, axis=1)
print("Shape after dropping: ", telecom.shape)
We will replace missing values in the categorical columns with '-1', where '-1' will act as a new category.
# Replace missing values with -1 in categorical columns; -1 acts as an explicit
# "unknown" category rather than discarding the rows
telecom[cat_cols] = telecom[cat_cols].apply(lambda x: x.fillna(-1))
# verify: missing ratio must now be 0 for all categorical columns
print("Missing value ratio:\n")
print(telecom[cat_cols].isnull().sum()*100/telecom.shape[0])
# Drop columns whose missing-value fraction is at or above the threshold.
initial_cols = telecom.shape[1]
MISSING_THRESHOLD = 0.7
# True for columns to keep (missing fraction strictly below the threshold)
include_cols = list(telecom.apply(lambda column: column.isnull().sum()/telecom.shape[0] < MISSING_THRESHOLD))
drop_missing = pd.DataFrame({'features':telecom.columns , 'include': include_cols})
drop_missing.loc[drop_missing.include == True,:]
# Drop columns
telecom = telecom.loc[:, include_cols]
# BUG FIX: dropped count is initial minus remaining; the original had the
# operands reversed and printed a negative number
dropped_cols = initial_cols - telecom.shape[1]
print("{0} columns dropped.".format(dropped_cols))
# Impute remaining nulls as 0 in all non-datetime, non-category columns.
# NOTE(review): fillna with axis=1 is deprecated in newer pandas; behaves as a
# plain scalar fill here — confirm on upgrade.
telecom[telecom.select_dtypes(exclude=['datetime64[ns]','category']).columns.tolist()] = telecom[telecom.select_dtypes(exclude=['datetime64[ns]','category']).columns.tolist()].fillna(0, axis=1)
# Total data recharge amount for June and July --> number of recharges * average recharge amount
telecom['total_data_rech_6'] = telecom.total_rech_data_6 * telecom.av_rech_amt_data_6
telecom['total_data_rech_7'] = telecom.total_rech_data_7 * telecom.av_rech_amt_data_7
# Total recharge amount for June and July --> call recharge amount + data recharge amount
telecom['amt_data_6'] = telecom.total_rech_amt_6 + telecom.total_data_rech_6
telecom['amt_data_7'] = telecom.total_rech_amt_7 + telecom.total_data_rech_7
# Average recharge done by customer over the two "good phase" months (June, July)
telecom['av_amt_data_6_7'] = (telecom.amt_data_6 + telecom.amt_data_7)/2
# look at the 70th percentile recharge amount
print("Recharge amount at 70th percentile: {0}".format(telecom.av_amt_data_6_7.quantile(0.7)))
# High-value customers: average good-phase recharge >= the 70th percentile
telecom_filtered = telecom.loc[telecom.av_amt_data_6_7 >= telecom.av_amt_data_6_7.quantile(0.7), :]
telecom_filtered = telecom_filtered.reset_index(drop=True)
telecom_filtered.shape
# Delete the helper variables that were only needed for the filter above
telecom_filtered = telecom_filtered.drop(['total_data_rech_6', 'total_data_rech_7',
'amt_data_6', 'amt_data_7', 'av_amt_data_6_7'], axis=1)
telecom_filtered.shape
telecom_filtered.info()
# Churn Derivation
# total incoming + outgoing minutes of usage in the churn-phase month (September)
telecom_filtered['total_calls_mou_9'] = telecom_filtered.total_ic_mou_9 + telecom_filtered.total_og_mou_9
# total 2g + 3g data consumption in September
telecom_filtered['total_internet_mb_9'] = telecom_filtered.vol_2g_mb_9 + telecom_filtered.vol_3g_mb_9
# churn definition: no calls AND no data usage in September
# 0 - not churn, 1 - churn
telecom_filtered['churn'] = telecom_filtered.apply(lambda row: 1 if (row.total_calls_mou_9 == 0 and row.total_internet_mb_9 == 0) else 0, axis=1)
# verify that the churn is tagged correctly
telecom_filtered[['churn','total_ic_mou_9','total_og_mou_9','vol_2g_mb_9','vol_3g_mb_9']]
# After tagging churners, all churn-phase (month 9) attributes must be removed
# so the model only sees information available before the churn month.
# First, list the columns whose names end with _9
month_9_cols = telecom_filtered.columns[telecom_filtered.columns.str.endswith(pat = '_9')]
print("The columns names ends with _9 are given below...")
month_9_cols
# drop the two helper variables used only to derive churn
telecom_filtered = telecom_filtered.drop(['total_calls_mou_9', 'total_internet_mb_9'], axis=1)
# treat the target as categorical
telecom_filtered.churn = telecom_filtered.churn.astype("category")
# print churn ratio (class balance)
print("Churn Ratio:")
print(telecom_filtered.churn.value_counts()*100/telecom_filtered.shape[0])
# We see that approx. 8.1% of high value customers have churned
# Difference features: month-8 ("action phase") value minus the average of
# months 6 and 7 ("good phase"). A strong drop is a likely churn signal.
# The 23 identical copy-pasted statements are replaced by one data-driven loop;
# the columns are created in the same order, with identical values.
diff_base_cols = ['arpu', 'onnet_mou', 'offnet_mou', 'roam_ic_mou', 'roam_og_mou',
                  'loc_og_mou', 'std_og_mou', 'isd_og_mou', 'spl_og_mou', 'total_og_mou',
                  'loc_ic_mou', 'std_ic_mou', 'isd_ic_mou', 'spl_ic_mou', 'total_ic_mou',
                  'total_rech_num', 'total_rech_amt', 'max_rech_amt', 'total_rech_data',
                  'max_rech_data', 'av_rech_amt_data', 'vol_2g_mb', 'vol_3g_mb']
for base in diff_base_cols:
    telecom_filtered[base + '_diff'] = (telecom_filtered[base + '_8']
                                        - (telecom_filtered[base + '_6'] + telecom_filtered[base + '_7'])/2)
# quick sanity check of one of the derived columns
telecom_filtered['total_og_mou_diff'].describe()
# delete all variables relating to 9th month (keep only columns NOT ending in 9)
telecom_filtered = telecom_filtered.filter(regex='[^9]$', axis=1)
telecom_filtered.shape
# extract all names that end with 9 — taken from the unfiltered `telecom`,
# since telecom_filtered no longer contains them after the line above
col_9_names = telecom.filter(regex='9$', axis=1).columns
# purge month-9 names from cat_cols and rebuild num_cols from what remains
cat_cols = [col for col in cat_cols if col not in col_9_names]
cat_cols.append('churn')
num_cols = [col for col in telecom_filtered.columns if col not in cat_cols]
# change column types: numerics to numeric dtype, flags/target to category
telecom_filtered[num_cols] = telecom_filtered[num_cols].apply(pd.to_numeric)
telecom_filtered[cat_cols] = telecom_filtered[cat_cols].apply(lambda column: column.astype("category"), axis=0)
# correlation heatmap across all numeric columns (large; hard to read at this size)
plt.figure(figsize=(20,20))
sns.heatmap(telecom_filtered.corr(), annot=True)
As we can see, the above plot is not very easy to understand. Let's check the numerical value of the correlation between variables and find out whether there are features with high correlation; if so, we can use PCA.
# lets check the correlation amongst the features numerically
cor = telecom_filtered.corr()
# keep only the strict lower triangle so each pair appears once (no self-correlation)
cor.loc[:,:] = np.tril(cor, k=-1)
# stack into a Series of (feature_a, feature_b) -> correlation
cor = cor.stack()
cor.sort_values(ascending=False)
There is very high correlation between many variables, so there is a scope of dimensionality reduction, we would take a look at it in the next segment.
Let us create some utility methods to ease the EDA process.
# Method for numerical data to plot distribution plots (Univariate Analysis)
def distributionPlot(feature):
    """Univariate view of a numeric feature: distribution plot and box plot side by side."""
    figure = plt.figure(figsize=(14, 6))
    dist_axis = figure.add_subplot(1, 2, 1)
    box_axis = figure.add_subplot(1, 2, 2)
    sns.distplot(feature, norm_hist=True, ax=dist_axis)
    sns.boxplot(feature, orient='v', ax=box_axis)
# Reusable method to plot scatter plot (For Bivariate Analysis)
def scatterPlot(y, x):
    """Bivariate view: scatter of x against y on a fresh 10x8 figure."""
    fig = plt.figure(figsize=(10, 8))
    sns.scatterplot(x=x, y=y)
Let's try to do some univariate analysis
# class balance of the target: churners vs non-churners
sns.countplot(telecom_filtered.churn)
This clearly shows that there is high class imbalance between the churn and non-churn as amount of churning customers is very less in comparison to the non-churning ones.
# distribution of average revenue per user in June
distributionPlot(telecom_filtered.arpu_6)
The field arpu_6 has outliers, due to which the distribution graph is also skewed — which means its mean and median would be greater than its mode.
Let's check a few more fields
# distribution of total recharge amount in August
distributionPlot(telecom_filtered.total_rech_amt_8)
It is weird that there are some transactions crossing the 40000 mark. It might be a wrong data entry or someone with very high usage of phone.
Let's check the age on numbers distribution and see how the customers are distributed to different life-spans.
# distribution of on-network minutes of usage in August
distributionPlot(telecom_filtered.onnet_mou_8)
Let's try to do some Bivariate analysis
# show the scatterPlot helper's signature
help(scatterPlot)
# lets now draw a scatter plot between total recharge and avg revenue for the 8th month
scatterPlot(telecom_filtered.total_rech_num_8, telecom_filtered.arpu_8)
# churn-split box plots for selected features
sns.boxplot(x = telecom_filtered.churn, y = telecom_filtered.sep_vbc_3g)
sns.boxplot(x = telecom_filtered.churn, y = telecom_filtered.aon)
sns.boxplot(x = telecom_filtered.churn, y = telecom_filtered.spl_ic_mou_diff)
sns.boxplot(x = telecom_filtered.churn, y = telecom_filtered.total_ic_mou_diff)
sns.boxplot(x = telecom_filtered.churn, y = telecom_filtered.total_og_mou_8)
# log-transform to tame the heavy right tail; NOTE(review): log(0) gives -inf
# for customers with no recharge in August — confirm this is acceptable here
sns.boxplot(x = telecom_filtered.churn, y = np.log(telecom_filtered.max_rech_amt_8))
This means that when customers are about to churn they reduce the amount spent on recharges, as they might just want to get by until the time they churn out.
# churn vs max recharge amount: overlaid density estimates per class
plt.figure(figsize=(12,4))
# red: non-churners
ax = sns.kdeplot(telecom_filtered.max_rech_amt_8[(telecom_filtered["churn"] == 0)],
color="Red", shade = True)
# blue: churners, drawn on the same axes for comparison
ax = sns.kdeplot(telecom_filtered.max_rech_amt_8[(telecom_filtered["churn"] == 1)],
ax =ax, color="Blue", shade= True)
# create function to analyze a feature across the 6th, 7th and 8th months
def analyze_feature(feature_start):
    """Compare a monthly feature across churn classes.

    For columns named <feature_start>_6/_7/_8, prints the column names, draws
    churn-split box plots for each month, and returns a DataFrame of per-class
    means and standard deviations (row 0 = non-churned, row 1 = churned).
    """
    plt.rcParams["figure.figsize"] = [17, 7]
    print("Churn Stats (mean and standard deviation):")
    cols = {c: feature_start + "_" + str(c) for c in [6, 7, 8]}
    print(cols)
    # plain loop instead of a side-effect list comprehension
    frame_data = []
    for churn in [0, 1]:
        class_rows = telecom_filtered[telecom_filtered["churn"] == churn]
        frame_data.append({
            "mean_6": class_rows[cols[6]].mean(),
            "mean_7": class_rows[cols[7]].mean(),
            "mean_8": class_rows[cols[8]].mean(),
            "std_6": class_rows[cols[6]].std(),
            "std_7": class_rows[cols[7]].std(),
            "std_8": class_rows[cols[8]].std()
        })
    # one box plot per month, split and colored by churn
    f, axes = plt.subplots(nrows=1, ncols=3)
    for month, axis in zip([6, 7, 8], axes):
        sns.boxplot(data=telecom_filtered, y=cols[month], x="churn",
                    hue="churn", linewidth=2.5, showfliers=False, ax=axis)
    # BUG FIX: the row index was a *set* literal, whose iteration order is not
    # guaranteed — the "Churned"/"Non Churned" labels could attach to the wrong
    # rows. A list fixes the order to match churn == 0, 1.
    return pd.DataFrame(frame_data, index=["Non Churned", "Churned"]).round(2)
# compare total recharge amount across months for churners vs non-churners
analyze_feature('total_rech_amt')
# list every column relating to recharge amounts
recharge_amnt_columns = telecom_filtered.columns[telecom_filtered.columns.str.contains('rech_amt')]
recharge_amnt_columns
# let's analyze max recharge amount the same way
analyze_feature('max_rech_amt')
# churn rate per night-pack-user category (column-normalized percentages)
pd.crosstab(telecom_filtered.churn, telecom_filtered.night_pck_user_8, normalize='columns')*100
# raw counts of churn vs number of 3g sachet recharges in August
pd.crosstab(telecom_filtered.churn, telecom_filtered.sachet_3g_8)
def cap_outliers(array, k=3):
    """Cap values lying beyond mean +/- k standard deviations.

    Parameters:
        array: pandas Series or numpy array of numeric values.
        k: number of standard deviations defining the cap limits (default 3).
    Returns:
        A NEW array/Series with out-of-range values clamped to the limits.
        BUG FIX: the original mutated the caller's array in place; we now
        operate on a copy so the input data is left untouched.
    """
    upper_limit = array.mean() + k * array.std()
    lower_limit = array.mean() - k * array.std()
    capped = array.copy()
    capped[capped < lower_limit] = lower_limit
    capped[capped > upper_limit] = upper_limit
    return capped
# example of capping on a toy array
sample_array = list(range(100))
# add outliers to the data
sample_array[0] = -9999
sample_array[99] = 9999
# cap outliers at mean +/- 2 standard deviations
sample_array = np.array(sample_array)
print("Array after capping outliers: \n", cap_outliers(sample_array, k=2))
# cap outliers in every numeric column (column-wise apply)
telecom_filtered[num_cols] = telecom_filtered[num_cols].apply(cap_outliers, axis=0)
# inspect the tails after capping
telecom_filtered.describe(percentiles=[0.01, 0.10,.25,.5,.75,.90,.95,.99])
If we look at the spending patterns of the two categories of customers — churning and non-churning — it is clearly visible that they follow very different spending behaviours, and such behaviour can be an indicator of churning vs non-churning.
Let's start the model building, but before that, let's divide our dataset into training and testing sets.
# Separate the predictors (X) from the target (y)
X = telecom_filtered.drop(['churn'], axis=1)
y = telecom_filtered['churn']
# Scale the dataset as the units vary widely between features
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# NOTE(review): the scaler is fit on ALL rows before the split — this leaks
# test-set statistics into training. The later Pipeline-based models re-scale
# correctly inside cross-validation; consider fitting this scaler on the
# training portion only.
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)
# Train test split: 70% train, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=1)
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
In this step we would create multiple models and try to find the best model among them. Also, we would use PCA for dimensionality reduction
from sklearn import linear_model
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn import tree
# final check: no missing values should remain anywhere
(telecom_filtered.isnull().sum()*100/telecom_filtered.shape[0]).sort_values(ascending=False)
# PCA preceded by scaling, wrapped in a pipeline
pca = Pipeline([('scaler', StandardScaler()), ('pca', PCA())])
pca.fit(X_train)
# NOTE(review): fit_transform here refits the pipeline — the fit() above is redundant
churn_pca = pca.fit_transform(X_train)
# extract the fitted PCA step from the pipeline
pca = pca.named_steps['pca']
# cumulative explained variance (percent) per number of components
print(pd.Series(np.round(pca.explained_variance_ratio_.cumsum(), 4)*100))
We can see that 55 Principal Components are able to explain ~90% of the variance.
Similarly, 75 Principal Components are able to explain ~95% of the variance.
Let's visualize this using the Scree plot
# Scree plot: cumulative explained variance vs number of components
features = range(pca.n_components_)
cumulative_variance = np.round(np.cumsum(pca.explained_variance_ratio_)*100, decimals=4)
plt.figure(figsize=[10,6])
plt.plot(cumulative_variance)
Let us finalize the PC components to 55 and try it with various models such as Logistic Regression, Decision Trees etc.
Let us create a pipeline for performing the PCA followed by logistic regression.
# Pipeline: scale -> 55 principal components (~90% variance) -> logistic regression.
# class_weight='balanced' compensates for the ~8% churn class imbalance.
PCA_VARS = 55
steps = [('scaler', StandardScaler()),
("pca", PCA(n_components=PCA_VARS)),
("logistic", LogisticRegression(class_weight='balanced'))
]
pipeline = Pipeline(steps)
# fit model
pipeline.fit(X_train, y_train)
# check accuracy on train data
pipeline.score(X_train, y_train)
def printScores(y_test, y_pred):
    """Print the confusion matrix, sensitivity and specificity.

    sklearn convention: confusion matrix rows are true labels, columns are
    predictions; class 1 (churn) is the positive class.
    """
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    # BUG FIX: the two ratios were labelled the wrong way around.
    # Sensitivity = TP / (TP + FN) — recall of the positive (churn) class.
    sensitivity = cm[1,1]/(cm[1,0]+cm[1,1])
    print('Sensitivity : ', sensitivity )
    # Specificity = TN / (TN + FP) — recall of the negative class.
    specificity = cm[0,0]/(cm[0,0]+cm[0,1])
    print('Specificity : ', specificity)
# predict churn on test data and report confusion-matrix based scores
y_pred = pipeline.predict(X_test)
printScores(y_test, y_pred)
# area under the ROC curve, computed from predicted churn probabilities
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
print("AUC:", round(roc_auc_score(y_test, y_pred_prob),2))
As it is mentioned in the problem statement we need to identify the churns correctly, that means recall/sensitivity is what we want to maximize, as a result we need to have high AUC value for that.
# class imbalance in the training target
y_train.value_counts()/y_train.shape
# PCA
pca = PCA()
# logistic regression - explicit class weights handle the imbalance by
# re-weighting the cost function towards the minority (churn) class
logistic = LogisticRegression(class_weight={0:0.06, 1: 0.94})
# create pipeline: scale -> PCA -> logistic regression
steps = [("scaler", StandardScaler()),
("pca", pca),
("logistic", logistic)
]
# compile pipeline
pca_logistic = Pipeline(steps)
# hyperparameter space
# NOTE(review): penalty='l1' requires a compatible solver (e.g. liblinear/saga)
# on newer sklearn versions — confirm against the installed version
params = {'pca__n_components': [55, 75], 'logistic__C': [0.1, 0.5, 1, 2, 3, 4, 5, 10], 'logistic__penalty': ['l1', 'l2']}
# create 5 folds
folds = 5
# grid search optimising recall (catching churners matters most here)
model = GridSearchCV(estimator=pca_logistic, cv=folds, param_grid=params, scoring='recall', return_train_score=True, n_jobs=-1, verbose=1)
# fit model
model.fit(X_train, y_train)
# cross validation results
pd.DataFrame(model.cv_results_)
# Final hyperparameters
# BUG FIX: the label now matches the metric actually optimised (scoring='recall'),
# the original printed "Best AUC" for a recall score
print("Best recall: ", model.best_score_)
print("Best hyperparameters: ", model.best_params_)
# predict churn on test data
y_pred = model.predict(X_test)
printScores(y_test, y_pred)
# check area under curve on the held-out test set
y_pred_prob = model.predict_proba(X_test)[:, 1]
print("AUC:", round(roc_auc_score(y_test, y_pred_prob),2))
Let us build predictive model using Random Forest and PCA. Although, we don't need to worry about dimensionality reduction in Random Forest but adding PCA might help in faster convergence.
# PCA
pca = PCA()
# Class imbalance fix: weight the minority (churn) class heavily
forest = RandomForestClassifier(class_weight={0:0.06, 1: 0.94})
# create pipeline: scale -> PCA -> random forest
steps = [("scaler", StandardScaler()),
("pca", pca),
("random_forest", forest)
]
# compile pipeline
pca_random = Pipeline(steps)
# hyperparameter space
params = {'pca__n_components': [55, 75], "random_forest__criterion": ['gini', 'entropy'], "random_forest__max_features": ['auto', 'log2', 0.4], "random_forest__min_samples_leaf" : [20, 30, 40, 50, 100]}
# create 5 folds
folds = 5
# grid search optimising ROC-AUC
model = GridSearchCV(estimator=pca_random, cv=folds, param_grid=params, scoring='roc_auc', return_train_score=True, n_jobs=-1, verbose=1)
# fit model
model.fit(X_train, y_train)
# Final hyperparameters
print("Best AUC: ", model.best_score_)
print("Best hyperparameters: ", model.best_params_)
# predict churn on test data
y_pred = model.predict(X_test)
printScores(y_test, y_pred)
# check area under curve
y_pred_prob = model.predict_proba(X_test)[:, 1]
print("AUC: ", round(roc_auc_score(y_test, y_pred_prob),2))
# Feature importances from the tuned random forest.
# BUG FIX: `rf_model` was never defined anywhere (NameError); pull the fitted
# forest out of the grid search's best pipeline instead.
# NOTE: the forest was trained on PCA-transformed inputs, so the importances
# describe principal components — pairing them with the original column names
# (as the original code attempted) would both mislead and mismatch in length.
best_forest = model.best_estimator_.named_steps['random_forest']
importance = best_forest.feature_importances_
features = ['PC{0}'.format(i + 1) for i in range(len(importance))]
# create dataframe
feature_importance = pd.DataFrame({'variables': features, 'importance_percentage': importance*100})
feature_importance = feature_importance[['variables', 'importance_percentage']]
# sort components by importance (should sum to ~100)
feature_importance = feature_importance.sort_values('importance_percentage', ascending=False).reset_index(drop=True)
print("Sum of importance=", feature_importance.importance_percentage.sum())
feature_importance
# Plain logistic regression (balanced class weights) on the pre-scaled features
learner_pca = LogisticRegression(class_weight='balanced')
learner_pca.fit(X_train,y_train)
# Predict on training set: hard labels and churn probabilities
dtrain_predictions = learner_pca.predict(X_train)
dtrain_predprob = learner_pca.predict_proba(X_train)[:,1]
# BUG FIX: the first metric is roc_auc_score on hard predictions, not accuracy;
# the label was misleading
print ("ROC-AUC (hard predictions) :",metrics.roc_auc_score(y_train, dtrain_predictions))
print ("Recall/Sensitivity :",metrics.recall_score(y_train, dtrain_predictions))
print ("AUC Score (Train):",metrics.roc_auc_score(y_train, dtrain_predprob))
Let us build predictive model using Decision Tree and PCA.
# PCA
pca = PCA()
# Class imbalance fix: weight the minority (churn) class heavily
dc = DecisionTreeClassifier(class_weight={0:0.06, 1: 0.94})
# create pipeline: scale -> PCA -> decision tree
steps = [("scaler", StandardScaler()),
("pca", pca),
("dc", dc)
]
# compile pipeline
pca_dc = Pipeline(steps)
# hyperparameter space
params = {'pca__n_components': [55, 75], "dc__criterion": ['gini', 'entropy'], "dc__max_features": ['auto', 'log2', 0.4], "dc__max_depth": [2, 3, 4], "dc__min_samples_leaf" : [20, 30, 40, 50, 100]}
# create 5 folds
folds = 5
# grid search optimising ROC-AUC
model = GridSearchCV(estimator=pca_dc, cv=folds, param_grid=params, scoring='roc_auc', return_train_score=True, n_jobs=-1, verbose=1)
# fit model
model.fit(X_train, y_train)
# Final hyperparameters
print("Best AUC: ", model.best_score_)
print("Best hyperparameters: ", model.best_params_)
Although we got the best score at max_depth = 4, such a tree can get complex and might overfit. Let's try max_depth = 3.
# refit a standalone tree with max_depth=3 (rather than the grid's 4) to reduce
# overfitting risk, then evaluate on test data.
# NOTE(review): this tree is fit on X_train directly, without the PCA step used
# during the search — confirm this is intended.
dc = DecisionTreeClassifier(class_weight={0:0.06, 1: 0.94}, criterion='entropy', max_depth=3, min_samples_leaf=50)
dc.fit(X_train, y_train)
y_pred = dc.predict(X_test)
printScores(y_test, y_pred)
# check area under curve
y_pred_prob = dc.predict_proba(X_test)[:, 1]
print("AUC: ", round(roc_auc_score(y_test, y_pred_prob),2))
PCA with Logistic Regression
Sensitivity : 0.7590667311411993
Specificity : 0.8669410150891632
AUC: 0.89
PCA with Random Forest
Sensitivity : 0.8740328820116054
Specificity : 0.7366255144032922
AUC: 0.88
PCA with Decision Classifier
Sensitivity : 0.7920696324951644
Specificity : 0.8024691358024691
AUC: 0.88
Logistic Regression
Accuracy : 0.8415961015904517
Recall/Sensitivity : 0.8539719626168224
Overall, the PCA with Random Forest is the best for the above scenario as it has the best sensitivity and comparable accuracy for the given business objective.
Let us now design a model to get the most important predictors for churn problem and let us visualize them. We would use decision tree for identifying the most important predictors as it is very easy to identify the most important predictors by visualizing the decision tree.
# Interpretable model: shallow decision tree without PCA, so splits are on the
# original features. Class weights compensate for churn imbalance.
dc = DecisionTreeClassifier(class_weight={0:0.06, 1: 0.94}, max_depth=3, max_features='auto', min_samples_leaf=50)
# create pipeline: scale -> decision tree
steps = [("scaler", StandardScaler()),
("dc", dc)]
# compile pipeline
pipeline = Pipeline(steps)
# fit model
pipeline.fit(X_train, y_train)
# check accuracy on train data
pipeline.score(X_train, y_train)
# predict churn on test data
y_pred = pipeline.predict(X_test)
printScores(y_test, y_pred)
# check area under curve
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
print("AUC:", round(roc_auc_score(y_test, y_pred_prob),2))
# Tune the no-PCA decision tree via grid search (class weights as before)
dc = DecisionTreeClassifier(class_weight={0:0.06, 1: 0.94})
# create pipeline: scale -> decision tree
steps = [("scaler", StandardScaler()),
("dc", dc)
]
# compile pipeline
pipeline_dc = Pipeline(steps)
# hyperparameter space
params = {"dc__criterion": ['gini', 'entropy'], "dc__max_features": ['auto', 'log2', 0.4], "dc__max_depth": [2, 3, 4], "dc__min_samples_leaf" : [20, 30, 40, 50, 100]}
# create 5 folds
folds = 5
# grid search optimising ROC-AUC
model = GridSearchCV(estimator=pipeline_dc, cv=folds, param_grid=params, scoring='roc_auc', return_train_score=True, n_jobs=-1, verbose=1)
# fit model
model.fit(X_train, y_train)
# Final hyperparameters
print("Best AUC: ", model.best_score_)
print("Best hyperparameters: ", model.best_params_)
# refit a standalone tree with the chosen hyperparameters and evaluate on test data
dc = DecisionTreeClassifier(class_weight={0:0.06, 1: 0.94}, criterion='entropy', max_depth=4, min_samples_leaf=30, max_features=0.4)
dc.fit(X_train, y_train)
y_pred = dc.predict(X_test)
printScores(y_test, y_pred)
# check area under curve
y_pred_prob = dc.predict_proba(X_test)[:, 1]
print("AUC: ", round(roc_auc_score(y_test, y_pred_prob),2))
For ease, and to prevent errors, the following code is commented out; if graphviz is installed and its path is specified, it can be run. The output of this code is attached as an image.
# # Importing required packages for visualization
# from IPython.display import Image
# from sklearn.externals.six import StringIO
# from sklearn.tree import export_graphviz
# import pydotplus, graphviz
# # Putting features
# features = list(telecom_filtered.columns)
# features.remove('churn')
# # If you're on windows:
# # Specifing path for dot file.
# import os
# os.environ["PATH"] += os.pathsep + 'C:\Program Files (x86)\Graphviz2.38\bin'
# # plotting tree with max_depth=4
# dot_data = StringIO()
# export_graphviz(dc, out_file=dot_data,
# feature_names=features, filled=True,rounded=True)
# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# graph.write_pdf("dt_churn.pdf")
In a decision tree, the overall entropy is decreased at each node, and feature importance is calculated from this reduction. Since we used entropy when building the tree, reading from the root downwards shows the decreasing importance of each feature.
loc_ic_mou_8, roam_og_mou_8 & vol_3g_mb_8 are some of the most important features. The company should focus on the above parameters in the action phase can help company to provide offers to such customers to retain the probable churn customers. They can provide offers related to the data usage and roaming outgoing call plans so that they improve the usage and the customers do not leave the company.
The company needs to focus on the STD and ISD rates. Perhaps, the rates are too high. Provide them with some kind of STD and ISD packages.
To look into both of the issues stated above, it is desired that the telecom company collects customer query and complaint data and work on their services according to the needs of customers.